# -*- coding: utf-8 -*-
## Code by Nick Eubank

## This file flattens a nested dictionary of textual data from village assemblies recordings into a .csv for use in R.  

## Village assembly transcripts collected as part of an impact evaluation of the Pudhu Vaazhvu Project, a poverty alleviation and livelihoods program implemented by the World Bank and Government of Tamil Nadu. Proceedings were recorded in a matched sample of 50 treatment and 50 control villages on Republic Day, in January 2014. Recordings were then transcribed into Tamil, and then manually translated into English by a team from our survey firm.

## Transcripts were cleaned manually, formatted into a nested dictionary to extract relevant meta-data (geographic identifiers, gender and position of the speaker, etc.), and exported to a flat .csv file. See Transcripts_DTM.py for this code.



##### Import Packages
import pandas as pd
import os
import pickle
import re



## Set Working Directory
# NOTE: machine-specific absolute path; edit before running elsewhere.
wd = "/Users/rmparthasarathy/Dropbox/wb-pvp/GS_Stata/Transcript_Analysis/"
os.chdir(wd)


## Load Nested dictionary
# The dictionary is keyed by town. Each town's dictionary holds the 'block',
# 'projectstatus' and 'district' entries plus one entry per speech, under
# keys that start with 'speech'.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The `with` block closes the file handle as soon as the load finishes.
with open('transcript_corpus_v2.pkl', 'rb') as f:
    master_dict = pickle.load(f)

# Gather into a list of dataframes (one per town)

list_of_town_dataframes = []

# Loop
for town in master_dict:
    town_record = master_dict[town]

    # Convert speeches to one-row dataframes
    list_of_dataframes = []

    for speech in town_record:
        # Only keys beginning with 'speech' are speeches; the remaining keys
        # are town-level meta-data handled below.
        if speech.startswith('speech'):
            print(speech)
            # index=[0] builds a single-row frame from the speech entry
            # (presumably a dict of scalar fields -- TODO confirm).
            entry = pd.DataFrame(town_record[speech], index=[0])
            entry['speech_id'] = speech
            list_of_dataframes.append(entry)

    # pd.concat raises ValueError if a town has no speech entries.
    town_dataframe = pd.concat(list_of_dataframes)

    # Add in meta-data (same value repeated on every speech row of the town)
    for i in ['block', 'projectstatus', 'district']:
        town_dataframe[i] = town_record[i]

    town_dataframe['village'] = town

    # Tack to end of list
    list_of_town_dataframes.append(town_dataframe)

# Combine from each list

final = pd.concat(list_of_town_dataframes)

# check
# print() with parentheses runs under both Python 2 and Python 3.
print(final.head())


# Every row carries index 0 (each one-row frame was built with index=[0]), so
# the first, unnamed column of the CSV is all zeros.
final.to_csv('gs_speeches2.csv')





#########################################################################
## Revised dictionary
#########################################################################


## Set Working Directory
# NOTE: machine-specific absolute path; edit before running elsewhere.
wd = "/Users/rmparthasarathy/Dropbox/wb-pvp/GS_Stata/Transcript_Analysis/"
os.chdir(wd)


## Load Nested dictionary
# NOTE(review): the banner above says "Revised dictionary", yet this section
# loads the v1 pickle and writes gs_speeches1.csv -- confirm which corpus
# version is the revised one.
# The dictionary is keyed by town. Each town's dictionary holds the 'block',
# 'projectstatus' and 'district' entries plus one entry per speech, under
# keys that start with 'speech'.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The `with` block closes the file handle as soon as the load finishes.
with open('transcript_corpus_v1.pkl', 'rb') as f:
    master_dict = pickle.load(f)

# Gather into a list of dataframes (one per town)

list_of_town_dataframes = []

# Loop
for town in master_dict:
    town_record = master_dict[town]

    # Convert speeches to one-row dataframes
    list_of_dataframes = []

    for speech in town_record:
        # Only keys beginning with 'speech' are speeches; the remaining keys
        # are town-level meta-data handled below.
        if speech.startswith('speech'):
            print(speech)
            # index=[0] builds a single-row frame from the speech entry
            # (presumably a dict of scalar fields -- TODO confirm).
            entry = pd.DataFrame(town_record[speech], index=[0])
            entry['speech_id'] = speech
            list_of_dataframes.append(entry)

    # pd.concat raises ValueError if a town has no speech entries.
    town_dataframe = pd.concat(list_of_dataframes)

    # Add in meta-data (same value repeated on every speech row of the town)
    for i in ['block', 'projectstatus', 'district']:
        town_dataframe[i] = town_record[i]

    town_dataframe['village'] = town

    # Tack to end of list
    list_of_town_dataframes.append(town_dataframe)

# Combine from each list

final = pd.concat(list_of_town_dataframes)

# check
# print() with parentheses runs under both Python 2 and Python 3.
print(final.head())


# Every row carries index 0 (each one-row frame was built with index=[0]), so
# the first, unnamed column of the CSV is all zeros.
final.to_csv('gs_speeches1.csv')
